suppressMessages(library(tidyverse))
suppressMessages(library(stringr))
#suppressMessages(library(ISLR))
suppressMessages(library(caret))
suppressMessages(library(doMC))
#suppressMessages(library(plotly))
#suppressMessages(library(stringr))
# Register the doMC parallel backend with four cores.
registerDoMC(cores = 4)
# Echo the feature table to the console for a quick visual check
# (the console truncates the listing at getOption("max.print")).
feature_vectors_cleaned
sp wp wnp snp ds dm dl ss
1 0.023980324 0.02910432 0.12174626 0.324861652 0.441483911 0.046320967 0.01229760 0.32035253
2 0.480492813 0.01642710 0.00000000 0.000000000 0.490759754 0.010266940 0.00000000 0.00000000
3 0.384615385 0.00000000 0.00000000 0.000000000 0.000000000 0.000000000 0.53846154 0.00000000
4 0.000000000 0.01886792 0.18867925 0.264150943 0.000000000 0.037735849 0.47169811 0.00000000
5 0.000000000 0.00000000 0.31250000 0.062500000 0.000000000 0.000000000 0.50000000 0.00000000
6 0.307692308 0.00000000 0.07692308 0.000000000 0.000000000 0.000000000 0.53846154 0.00000000
7 0.009990838 0.02277388 0.07490947 0.392260373 0.438026264 0.028532787 0.03346276 0.16042058
8 0.054333765 0.04786546 0.29495472 0.100905563 0.477360931 0.000000000 0.02328590 0.00000000
9 0.497124076 0.00000000 0.00000000 0.001643385 0.000000000 0.500410846 0.00000000 0.00000000
10 0.384615385 0.00000000 0.00000000 0.000000000 0.000000000 0.000000000 0.53846154 0.00000000
11 0.041198502 0.02496879 0.29588015 0.136079900 0.459425718 0.013732834 0.02746567 0.00000000
12 0.384615385 0.00000000 0.00000000 0.000000000 0.000000000 0.000000000 0.53846154 0.00000000
13 0.000000000 0.09523810 0.23809524 0.095238095 0.000000000 0.000000000 0.52380952 0.00000000
14 0.384615385 0.00000000 0.00000000 0.000000000 0.000000000 0.000000000 0.53846154 0.00000000
15 0.384615385 0.00000000 0.00000000 0.000000000 0.000000000 0.000000000 0.53846154 0.00000000
16 0.000000000 0.00000000 0.00000000 0.307692308 0.000000000 0.000000000 0.46153846 0.00000000
17 0.036755387 0.04562738 0.29531052 0.120405577 0.470215463 0.000000000 0.03041825 0.00000000
18 0.200000000 0.00000000 0.00000000 0.000000000 0.000000000 0.000000000 0.60000000 0.00000000
19 0.277372263 0.01459854 0.13868613 0.058394161 0.394160584 0.000000000 0.10948905 0.39416058
20 0.254901961 0.00000000 0.17647059 0.039215686 0.000000000 0.509803922 0.00000000 0.00000000
21 0.301886792 0.00000000 0.16981132 0.000000000 0.000000000 0.509433962 0.00000000 0.00000000
22 0.320754717 0.00000000 0.15094340 0.000000000 0.000000000 0.509433962 0.00000000 0.00000000
23 0.030769231 0.03216783 0.30909091 0.125874126 0.488111888 0.000000000 0.01258741 0.00000000
24 0.002197802 0.03296703 0.09890110 0.362637363 0.501098901 0.000000000 0.00000000 0.42417582
25 0.307692308 0.00000000 0.07692308 0.000000000 0.000000000 0.000000000 0.53846154 0.00000000
26 0.030476190 0.08380952 0.05904762 0.323809524 0.007619048 0.415238095 0.07809524 0.00000000
27 0.039783002 0.04882459 0.09222423 0.316455696 0.007233273 0.408679928 0.08499096 0.00000000
28 0.307692308 0.00000000 0.07692308 0.000000000 0.000000000 0.000000000 0.53846154 0.00000000
29 0.043586550 0.03237858 0.30510585 0.117061021 0.481942715 0.000000000 0.01867995 0.00000000
30 0.000000000 0.02985075 0.32835821 0.119402985 0.000000000 0.000000000 0.50746269 0.00000000
31 0.028985507 0.07246377 0.23188406 0.144927536 0.000000000 0.000000000 0.50724638 0.00000000
32 0.040404040 0.04617605 0.28571429 0.125541126 0.470418470 0.002886003 0.02741703 0.00000000
33 0.076923077 0.00000000 0.30769231 0.000000000 0.000000000 0.000000000 0.53846154 0.00000000
34 0.181818182 0.18181818 0.00000000 0.000000000 0.000000000 0.000000000 0.54545455 0.00000000
35 0.012606169 0.03719267 0.11569066 0.334376397 0.451497541 0.033616451 0.01493071 0.31282968
36 0.000000000 0.23076923 0.07692308 0.076923077 0.000000000 0.000000000 0.53846154 0.00000000
37 0.012987013 0.05194805 0.22077922 0.194805195 0.000000000 0.000000000 0.50649351 0.00000000
38 0.363636364 0.00000000 0.00000000 0.000000000 0.000000000 0.000000000 0.54545455 0.00000000
39 0.000000000 0.09836066 0.13114754 0.229508197 0.000000000 0.049180328 0.44262295 0.00000000
40 0.000000000 0.00000000 0.01666667 0.450000000 0.000000000 0.000000000 0.50000000 0.00000000
41 0.043478261 0.04347826 0.00000000 0.304347826 0.000000000 0.130434783 0.34782609 0.00000000
42 0.311475410 0.00000000 0.16393443 0.000000000 0.000000000 0.508196721 0.00000000 0.00000000
43 0.071428571 0.00000000 0.07142857 0.214285714 0.000000000 0.071428571 0.42857143 0.00000000
44 0.363636364 0.00000000 0.00000000 0.000000000 0.090909091 0.000000000 0.45454545 0.09090909
45 0.066666667 0.00000000 0.00000000 0.266666667 0.000000000 0.133333333 0.33333333 0.00000000
46 0.000000000 0.00000000 0.00000000 0.250000000 0.125000000 0.250000000 0.12500000 0.00000000
47 0.041666667 0.04166667 0.08333333 0.250000000 0.000000000 0.000000000 0.50000000 0.00000000
48 0.000000000 0.02941176 0.11764706 0.294117647 0.264705882 0.235294118 0.00000000 0.00000000
49 0.000000000 0.04166667 0.08333333 0.291666667 0.000000000 0.000000000 0.50000000 0.00000000
50 0.000000000 0.00000000 0.00000000 0.250000000 0.000000000 0.000000000 0.50000000 0.00000000
51 0.000000000 0.07692308 0.07692308 0.269230769 0.000000000 0.076923077 0.42307692 0.00000000
52 0.000000000 0.00000000 0.00000000 0.300000000 0.000000000 0.000000000 0.50000000 0.00000000
53 0.000000000 0.00000000 0.00000000 0.250000000 0.250000000 0.250000000 0.00000000 0.00000000
54 0.000000000 0.00000000 0.04166667 0.375000000 0.291666667 0.083333333 0.12500000 0.00000000
55 0.000000000 0.00000000 0.00000000 0.357142857 0.500000000 0.000000000 0.00000000 0.00000000
56 0.000000000 0.00000000 0.09375000 0.343750000 0.000000000 0.062500000 0.43750000 0.00000000
57 0.000000000 0.00000000 0.00000000 0.375000000 0.000000000 0.000000000 0.50000000 0.00000000
58 0.009523810 0.07619048 0.07619048 0.323809524 0.000000000 0.000000000 0.50476190 0.00000000
59 0.000000000 0.00000000 0.00000000 0.300000000 0.200000000 0.100000000 0.20000000 0.00000000
60 0.000000000 0.00000000 0.00000000 0.250000000 0.000000000 0.250000000 0.25000000 0.00000000
61 0.000000000 0.05555556 0.05555556 0.277777778 0.000000000 0.000000000 0.50000000 0.00000000
62 0.000000000 0.00000000 0.18518519 0.277777778 0.000000000 0.000000000 0.50000000 0.00000000
63 0.000000000 0.00000000 0.06250000 0.250000000 0.000000000 0.000000000 0.43750000 0.00000000
64 0.000000000 0.08571429 0.17142857 0.200000000 0.171428571 0.200000000 0.14285714 0.00000000
65 0.000000000 0.20000000 0.00000000 0.000000000 0.000000000 0.000000000 0.60000000 0.00000000
66 0.000000000 0.14285714 0.00000000 0.000000000 0.000000000 0.000000000 0.42857143 0.00000000
sm sl modelsize class subclass port proto
1 0.17954499 2.049600e-04 4879 Normal-UDP-DNS normal 53 udp
2 0.00000000 5.010267e-01 487 Normal-TCP-HTTP normal 80 tcp
3 0.00000000 5.384615e-01 13 Normal-TCP-HTTP normal 443 tcp
4 0.13207547 3.773585e-01 53 Normal-TCP-HTTP normal 80 tcp
5 0.06250000 4.375000e-01 16 Normal-TCP-HTTP normal 80 tcp
6 0.15384615 3.846154e-01 13 Normal-UDP-NTP-server normal 123 udp
7 0.33933947 2.617687e-04 22921 Normal-UDP-DNS normal 53 udp
8 0.49547219 5.174644e-03 773 Normal-TCP-HTTP normal 80 tcp
9 0.00000000 5.004108e-01 1217 Normal-TCP-HTTP normal 80 tcp
10 0.07692308 4.615385e-01 13 Normal-TCP-Jabber normal 5222 tcp
11 0.47815231 2.247191e-02 801 Normal-TCP-HTTP normal 80 tcp
12 0.00000000 5.384615e-01 13 Normal-TCP-Jabber normal 5222 tcp
13 0.04761905 4.761905e-01 21 Normal-TCP-HTTP normal 443 tcp
14 0.07692308 4.615385e-01 13 Normal-TCP-HTTP normal 443 tcp
15 0.00000000 5.384615e-01 13 Normal-TCP-HTTP normal 443 tcp
16 0.30769231 1.538462e-01 13 Normal-TCP-HTTP normal 80 tcp
17 0.49176172 8.871990e-03 789 Normal-TCP-HTTP normal 80 tcp
18 0.20000000 4.000000e-01 5 Normal-TCP-MSN normal 1863 tcp
19 0.04379562 6.569343e-02 137 Normal-UDP-NTP-server normal 123 udp
20 0.00000000 5.098039e-01 51 Normal-TCP-HTTP normal 443 tcp
21 0.00000000 5.094340e-01 53 Normal-TCP-HTTP normal 443 tcp
22 0.00000000 5.094340e-01 53 Normal-TCP-HTTP normal 443 tcp
23 0.49790210 2.797203e-03 715 Normal-TCP-HTTP normal 80 tcp
24 0.07692308 0.000000e+00 455 Normal-TCP-HTTP-CVUT-WebServer normal 53 udp
25 0.07692308 4.615385e-01 13 Normal-UDP-NTP-server normal 123 udp
26 0.00000000 5.009524e-01 525 Normal-TCP-HTTP normal 443 tcp
27 0.00000000 5.009042e-01 553 Normal-TCP-HTTP normal 443 tcp
28 0.07692308 4.615385e-01 13 Normal-UDP-NTP-server normal 123 udp
29 0.49439601 6.226650e-03 803 Normal-TCP-HTTP normal 80 tcp
30 0.00000000 5.074627e-01 67 Normal-TCP-HTTP normal 443 tcp
31 0.00000000 5.072464e-01 69 Normal-TCP-HTTP normal 443 tcp
32 0.48629149 1.443001e-02 693 Normal-TCP-HTTP normal 80 tcp
33 0.07692308 4.615385e-01 13 Normal-UDP-NTP-server normal 123 udp
34 0.00000000 5.454545e-01 11 Normal-UDP-NTP-server normal 123 udp
35 0.18712561 8.940545e-05 11185 Normal-UDP-DNS normal 53 udp
36 0.00000000 5.384615e-01 13 Normal-TCP-IMAP normal 993 tcp
37 0.00000000 5.064935e-01 77 Normal-TCP-HTTP normal 443 tcp
38 0.54545455 0.000000e+00 11 Normal-UDP-NTP-server normal 123 udp
39 0.36065574 1.311475e-01 61 Normal-TCP-HTTP normal 80 tcp
40 0.46666667 3.333333e-02 60 Normal-TCP-HTTP normal 80 tcp
41 0.17391304 3.043478e-01 23 Normal-TCP-HTTP normal 80 tcp
42 0.00000000 5.081967e-01 61 Normal-TCP-HTTP normal 443 tcp
43 0.00000000 5.000000e-01 14 Normal-TCP-HTTP normal 443 tcp
44 0.45454545 0.000000e+00 11 Normal-UDP-NTP-server normal 123 udp
45 0.00000000 4.666667e-01 15 Normal-TCP-HTTP normal 80 tcp
46 0.12500000 3.750000e-01 8 Normal-TCP-HTTP normal 80 tcp
47 0.00000000 5.000000e-01 24 Normal-TCP-HTTP normal 80 tcp
48 0.17647059 3.235294e-01 34 Normal-TCP-HTTP normal 80 tcp
49 0.20833333 2.916667e-01 24 Normal-TCP-HTTP normal 80 tcp
50 0.25000000 2.500000e-01 8 Normal-TCP-HTTP normal 80 tcp
51 0.03846154 4.615385e-01 26 Normal-TCP-HTTP normal 80 tcp
52 0.30000000 2.000000e-01 10 Normal-TCP-HTTP normal 80 tcp
53 0.12500000 3.750000e-01 8 Normal-TCP-HTTP normal 80 tcp
54 0.12500000 3.750000e-01 24 Normal-TCP-HTTP normal 80 tcp
55 0.00000000 5.000000e-01 14 Normal-TCP-HTTP normal 80 tcp
56 0.12500000 3.750000e-01 32 Normal-TCP-HTTP normal 80 tcp
57 0.12500000 3.750000e-01 16 Normal-TCP-HTTP normal 80 tcp
58 0.07619048 4.285714e-01 105 Normal-TCP-HTTP normal 80 tcp
59 0.20000000 3.000000e-01 10 Normal-TCP-HTTP normal 80 tcp
60 0.25000000 2.500000e-01 8 Normal-TCP-HTTP normal 80 tcp
61 0.11111111 3.888889e-01 18 Normal-TCP-HTTP normal 80 tcp
62 0.29629630 2.037037e-01 54 Normal-TCP-HTTP normal 80 tcp
63 0.18750000 2.500000e-01 16 Normal-TCP-HTTP normal 80 tcp
64 0.14285714 3.714286e-01 35 Normal-TCP-HTTP normal 80 tcp
65 0.00000000 6.000000e-01 5 Normal-TCP-HTTP normal 80 tcp
66 0.00000000 4.285714e-01 7 Normal-TCP-HTTP normal 80 tcp
[ reached 'max' / getOption("max.print") -- omitted 8922 rows ]
# Keep a copy of the feature table as it is at this point.
feature_vectors_cleaned.bkp <- feature_vectors_cleaned

# Number of rows per `class`, most frequent class first.
feature_vectors_cleaned %>%
  group_by(class) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
# A tibble: 47 x 2
class n
<fct> <int>
1 Botnet-TCP-SMTP-Attempt-SPAM 4297
2 Normal-TCP-HTTP 2602
3 Botnet-TCP-HTTP-Established-Ad 487
4 Botnet-TCP-HTTP-Established 377
5 Botnet-UDP-DNS 301
6 Botnet-TCP-HTTP-Established-SSL 210
7 Botnet-TCP-HTTP-CC-Not-Encrypted 68
8 Normal-UDP-NTP-server 67
9 Botnet-TCP-HTTPS-Established-Microsoft 58
10 Botnet-TCP-HTTP-Google-Net-Established 50
# ... with 37 more rows
# Pull the two classes of interest out into their own tables.
feature_vectors_cleaned_aux_botnet <- feature_vectors_cleaned %>%
  filter(class == "Botnet-TCP-SMTP-Attempt-SPAM")
feature_vectors_cleaned_aux_normal <- feature_vectors_cleaned %>%
  filter(class == "Normal-TCP-HTTP")

# Show the Botnet-TCP-SMTP-Attempt-SPAM subset.
feature_vectors_cleaned_aux_botnet
sp wp wnp snp ds dm dl ss
1 0.097560976 0.14634146 0.21951220 0.00000000 0.000000000 0.5121951 0.000000000 0.0000000
2 0.000000000 0.00000000 0.20000000 0.00000000 0.000000000 0.6000000 0.000000000 0.6000000
3 0.074074074 0.03703704 0.33333333 0.00000000 0.000000000 0.5185185 0.000000000 0.5185185
4 0.000000000 0.00000000 0.00000000 0.20000000 0.200000000 0.4000000 0.000000000 0.2000000
5 0.000000000 0.03597122 0.28057554 0.17266187 0.000000000 0.4964029 0.007194245 0.5035971
6 0.000000000 0.00000000 0.16666667 0.16666667 0.000000000 0.5000000 0.000000000 0.0000000
7 0.000000000 0.00000000 0.20000000 0.00000000 0.000000000 0.6000000 0.000000000 0.6000000
8 0.000000000 0.00000000 0.20000000 0.00000000 0.200000000 0.4000000 0.000000000 0.6000000
9 0.000000000 0.00000000 0.16666667 0.00000000 0.000000000 0.5000000 0.000000000 0.5000000
10 0.000000000 0.00000000 0.28571429 0.00000000 0.000000000 0.5714286 0.000000000 0.5714286
11 0.000000000 0.00000000 0.00000000 0.20000000 0.000000000 0.6000000 0.000000000 0.4000000
12 0.000000000 0.00000000 0.20000000 0.00000000 0.000000000 0.6000000 0.000000000 0.0000000
13 0.000000000 0.00000000 0.00000000 0.14285714 0.142857143 0.2857143 0.000000000 0.4285714
14 0.000000000 0.00000000 0.00000000 0.20000000 0.000000000 0.6000000 0.000000000 0.6000000
15 0.000000000 0.00000000 0.00000000 0.14285714 0.000000000 0.4285714 0.000000000 0.4285714
16 0.000000000 0.20000000 0.00000000 0.00000000 0.000000000 0.6000000 0.000000000 0.0000000
17 0.000000000 0.00000000 0.16666667 0.00000000 0.000000000 0.5000000 0.000000000 0.5000000
18 0.000000000 0.00000000 0.00000000 0.00000000 0.000000000 0.4000000 0.000000000 0.4000000
19 0.006960557 0.08352668 0.26914153 0.13689095 0.000000000 0.4988399 0.002320186 0.5011601
20 0.000000000 0.00000000 0.20000000 0.00000000 0.000000000 0.6000000 0.000000000 0.6000000
21 0.000000000 0.00000000 0.20000000 0.00000000 0.200000000 0.4000000 0.000000000 0.6000000
22 0.000000000 0.00000000 0.00000000 0.00000000 0.000000000 0.4000000 0.000000000 0.4000000
23 0.000000000 0.00000000 0.00000000 0.14285714 0.000000000 0.4285714 0.000000000 0.4285714
24 0.000000000 0.00000000 0.14285714 0.14285714 0.142857143 0.4285714 0.000000000 0.5714286
25 0.007407407 0.04444444 0.27407407 0.16296296 0.022222222 0.4814815 0.000000000 0.5037037
26 0.015267176 0.03816794 0.30534351 0.12977099 0.022900763 0.4809160 0.000000000 0.5038168
27 0.000000000 0.02127660 0.29787234 0.14893617 0.000000000 0.5106383 0.000000000 0.5106383
28 0.012875536 0.03433476 0.29184549 0.15450644 0.004291845 0.4978541 0.000000000 0.5021459
29 0.025641026 0.11111111 0.29344729 0.06552707 0.000000000 0.5014245 0.000000000 0.5014245
30 0.000000000 0.05882353 0.00000000 0.29411765 0.000000000 0.4705882 0.000000000 0.4705882
31 0.016806723 0.05882353 0.20168067 0.21008403 0.000000000 0.5042017 0.000000000 0.5042017
32 0.026785714 0.03571429 0.21428571 0.20535714 0.008928571 0.4821429 0.008928571 0.5000000
33 0.012658228 0.05063291 0.21518987 0.20253165 0.012658228 0.4936709 0.000000000 0.5063291
34 0.000000000 0.11111111 0.00000000 0.22222222 0.111111111 0.4444444 0.000000000 0.1111111
35 0.005494505 0.08791209 0.30769231 0.08791209 0.005494505 0.4945055 0.000000000 0.5000000
36 0.000000000 0.03278689 0.34426230 0.09836066 0.000000000 0.5081967 0.000000000 0.5081967
37 0.000000000 0.00000000 0.00000000 0.20000000 0.200000000 0.4000000 0.000000000 0.6000000
38 0.000000000 0.15384615 0.07692308 0.15384615 0.076923077 0.4615385 0.000000000 0.5384615
39 0.008695652 0.03478261 0.29565217 0.14782609 0.008695652 0.4956522 0.000000000 0.5043478
40 0.000000000 0.03773585 0.22641509 0.20754717 0.000000000 0.5094340 0.000000000 0.5094340
41 0.000000000 0.00000000 0.27272727 0.09090909 0.090909091 0.4545455 0.000000000 0.5454545
42 0.005586592 0.07262570 0.20111732 0.21229050 0.005586592 0.4972067 0.000000000 0.5027933
43 0.000000000 0.00000000 0.00000000 0.20000000 0.000000000 0.6000000 0.000000000 0.6000000
44 0.000000000 0.01886792 0.20754717 0.24528302 0.000000000 0.5094340 0.000000000 0.5094340
45 0.000000000 0.00000000 0.20000000 0.00000000 0.000000000 0.6000000 0.000000000 0.6000000
46 0.000000000 0.00000000 0.14285714 0.14285714 0.285714286 0.2857143 0.000000000 0.5714286
47 0.032786885 0.06557377 0.24590164 0.13114754 0.000000000 0.5081967 0.000000000 0.5081967
48 0.000000000 0.00000000 0.20000000 0.10000000 0.100000000 0.4000000 0.000000000 0.5000000
49 0.000000000 0.00000000 0.07407407 0.37037037 0.037037037 0.4814815 0.000000000 0.5185185
50 0.100000000 0.00000000 0.20000000 0.00000000 0.000000000 0.5000000 0.000000000 0.5000000
51 0.015384615 0.04615385 0.26153846 0.15384615 0.015384615 0.4923077 0.000000000 0.5076923
52 0.000000000 0.00000000 0.00000000 0.28571429 0.000000000 0.5714286 0.000000000 0.5714286
53 0.000000000 0.14285714 0.14285714 0.00000000 0.000000000 0.5714286 0.000000000 0.5714286
54 0.047619048 0.00000000 0.28571429 0.09523810 0.000000000 0.5238095 0.000000000 0.5238095
55 0.013698630 0.02739726 0.16438356 0.27397260 0.000000000 0.5068493 0.000000000 0.5068493
56 0.000000000 0.00000000 0.16666667 0.00000000 0.000000000 0.5000000 0.000000000 0.5000000
57 0.000000000 0.03508772 0.21052632 0.22807018 0.000000000 0.5087719 0.000000000 0.5087719
58 0.142857143 0.00000000 0.14285714 0.00000000 0.000000000 0.5714286 0.000000000 0.5714286
59 0.034168565 0.06833713 0.24601367 0.14806378 0.000000000 0.5011390 0.000000000 0.5011390
60 0.015873016 0.07936508 0.28571429 0.09523810 0.000000000 0.5079365 0.000000000 0.5079365
61 0.023255814 0.00000000 0.23255814 0.20930233 0.000000000 0.5116279 0.000000000 0.5116279
62 0.000000000 0.00000000 0.00000000 0.28571429 0.000000000 0.5714286 0.000000000 0.5714286
63 0.025641026 0.02564103 0.23076923 0.17948718 0.000000000 0.5128205 0.000000000 0.5128205
64 0.000000000 0.00000000 0.00000000 0.16666667 0.000000000 0.5000000 0.000000000 0.5000000
65 0.000000000 0.00000000 0.00000000 0.00000000 0.000000000 0.4000000 0.000000000 0.4000000
66 0.000000000 0.14285714 0.14285714 0.00000000 0.000000000 0.5714286 0.000000000 0.5714286
sm sl modelsize class subclass port proto
1 0.5121951 0 41 Botnet-TCP-SMTP-Attempt-SPAM botnet 443 tcp
2 0.0000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
3 0.0000000 0 27 Botnet-TCP-SMTP-Attempt-SPAM botnet 443 tcp
4 0.4000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
5 0.0000000 0 139 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
6 0.5000000 0 12 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
7 0.0000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
8 0.0000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
9 0.0000000 0 6 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
10 0.0000000 0 7 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
11 0.2000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
12 0.6000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
13 0.0000000 0 7 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
14 0.0000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
15 0.0000000 0 7 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
16 0.6000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
17 0.0000000 0 6 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
18 0.0000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
19 0.0000000 0 431 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
20 0.0000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
21 0.0000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
22 0.0000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
23 0.0000000 0 7 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
24 0.0000000 0 7 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
25 0.0000000 0 135 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
26 0.0000000 0 131 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
27 0.0000000 0 47 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
28 0.0000000 0 233 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
29 0.0000000 0 351 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
30 0.0000000 0 17 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
31 0.0000000 0 119 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
32 0.0000000 0 112 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
33 0.0000000 0 79 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
34 0.4444444 0 9 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
35 0.0000000 0 182 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
36 0.0000000 0 61 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
37 0.0000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
38 0.0000000 0 13 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
39 0.0000000 0 115 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
40 0.0000000 0 53 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
41 0.0000000 0 11 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
42 0.0000000 0 179 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
43 0.0000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
44 0.0000000 0 53 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
45 0.0000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
46 0.0000000 0 7 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
47 0.0000000 0 61 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
48 0.0000000 0 10 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
49 0.0000000 0 27 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
50 0.0000000 0 10 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
51 0.0000000 0 65 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
52 0.0000000 0 7 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
53 0.0000000 0 7 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
54 0.0000000 0 21 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
55 0.0000000 0 73 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
56 0.0000000 0 6 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
57 0.0000000 0 57 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
58 0.0000000 0 7 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
59 0.0000000 0 439 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
60 0.0000000 0 63 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
61 0.0000000 0 43 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
62 0.0000000 0 7 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
63 0.0000000 0 39 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
64 0.0000000 0 6 Botnet-TCP-SMTP-Attempt-SPAM botnet 25 tcp
65 0.0000000 0 5 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
66 0.0000000 0 7 Botnet-TCP-SMTP-Attempt-SPAM botnet 6667 tcp
[ reached 'max' / getOption("max.print") -- omitted 4231 rows ]
# Show the Normal-TCP-HTTP subset.
feature_vectors_cleaned_aux_normal
sp wp wnp snp ds dm dl ss
1 0.48049281 0.01642710 0.00000000 0.000000000 0.490759754 0.010266940 0.000000000 0
2 0.38461538 0.00000000 0.00000000 0.000000000 0.000000000 0.000000000 0.538461538 0
3 0.00000000 0.01886792 0.18867925 0.264150943 0.000000000 0.037735849 0.471698113 0
4 0.00000000 0.00000000 0.31250000 0.062500000 0.000000000 0.000000000 0.500000000 0
5 0.05433376 0.04786546 0.29495472 0.100905563 0.477360931 0.000000000 0.023285899 0
6 0.49712408 0.00000000 0.00000000 0.001643385 0.000000000 0.500410846 0.000000000 0
7 0.04119850 0.02496879 0.29588015 0.136079900 0.459425718 0.013732834 0.027465668 0
8 0.00000000 0.09523810 0.23809524 0.095238095 0.000000000 0.000000000 0.523809524 0
9 0.38461538 0.00000000 0.00000000 0.000000000 0.000000000 0.000000000 0.538461538 0
10 0.38461538 0.00000000 0.00000000 0.000000000 0.000000000 0.000000000 0.538461538 0
11 0.00000000 0.00000000 0.00000000 0.307692308 0.000000000 0.000000000 0.461538462 0
12 0.03675539 0.04562738 0.29531052 0.120405577 0.470215463 0.000000000 0.030418251 0
13 0.25490196 0.00000000 0.17647059 0.039215686 0.000000000 0.509803922 0.000000000 0
14 0.30188679 0.00000000 0.16981132 0.000000000 0.000000000 0.509433962 0.000000000 0
15 0.32075472 0.00000000 0.15094340 0.000000000 0.000000000 0.509433962 0.000000000 0
16 0.03076923 0.03216783 0.30909091 0.125874126 0.488111888 0.000000000 0.012587413 0
17 0.03047619 0.08380952 0.05904762 0.323809524 0.007619048 0.415238095 0.078095238 0
18 0.03978300 0.04882459 0.09222423 0.316455696 0.007233273 0.408679928 0.084990958 0
19 0.04358655 0.03237858 0.30510585 0.117061021 0.481942715 0.000000000 0.018679950 0
20 0.00000000 0.02985075 0.32835821 0.119402985 0.000000000 0.000000000 0.507462687 0
21 0.02898551 0.07246377 0.23188406 0.144927536 0.000000000 0.000000000 0.507246377 0
22 0.04040404 0.04617605 0.28571429 0.125541126 0.470418470 0.002886003 0.027417027 0
23 0.01298701 0.05194805 0.22077922 0.194805195 0.000000000 0.000000000 0.506493506 0
24 0.00000000 0.09836066 0.13114754 0.229508197 0.000000000 0.049180328 0.442622951 0
25 0.00000000 0.00000000 0.01666667 0.450000000 0.000000000 0.000000000 0.500000000 0
26 0.04347826 0.04347826 0.00000000 0.304347826 0.000000000 0.130434783 0.347826087 0
27 0.31147541 0.00000000 0.16393443 0.000000000 0.000000000 0.508196721 0.000000000 0
28 0.07142857 0.00000000 0.07142857 0.214285714 0.000000000 0.071428571 0.428571429 0
29 0.06666667 0.00000000 0.00000000 0.266666667 0.000000000 0.133333333 0.333333333 0
30 0.00000000 0.00000000 0.00000000 0.250000000 0.125000000 0.250000000 0.125000000 0
31 0.04166667 0.04166667 0.08333333 0.250000000 0.000000000 0.000000000 0.500000000 0
32 0.00000000 0.02941176 0.11764706 0.294117647 0.264705882 0.235294118 0.000000000 0
33 0.00000000 0.04166667 0.08333333 0.291666667 0.000000000 0.000000000 0.500000000 0
34 0.00000000 0.00000000 0.00000000 0.250000000 0.000000000 0.000000000 0.500000000 0
35 0.00000000 0.07692308 0.07692308 0.269230769 0.000000000 0.076923077 0.423076923 0
36 0.00000000 0.00000000 0.00000000 0.300000000 0.000000000 0.000000000 0.500000000 0
37 0.00000000 0.00000000 0.00000000 0.250000000 0.250000000 0.250000000 0.000000000 0
38 0.00000000 0.00000000 0.04166667 0.375000000 0.291666667 0.083333333 0.125000000 0
39 0.00000000 0.00000000 0.00000000 0.357142857 0.500000000 0.000000000 0.000000000 0
40 0.00000000 0.00000000 0.09375000 0.343750000 0.000000000 0.062500000 0.437500000 0
41 0.00000000 0.00000000 0.00000000 0.375000000 0.000000000 0.000000000 0.500000000 0
42 0.00952381 0.07619048 0.07619048 0.323809524 0.000000000 0.000000000 0.504761905 0
43 0.00000000 0.00000000 0.00000000 0.300000000 0.200000000 0.100000000 0.200000000 0
44 0.00000000 0.00000000 0.00000000 0.250000000 0.000000000 0.250000000 0.250000000 0
45 0.00000000 0.05555556 0.05555556 0.277777778 0.000000000 0.000000000 0.500000000 0
46 0.00000000 0.00000000 0.18518519 0.277777778 0.000000000 0.000000000 0.500000000 0
47 0.00000000 0.00000000 0.06250000 0.250000000 0.000000000 0.000000000 0.437500000 0
48 0.00000000 0.08571429 0.17142857 0.200000000 0.171428571 0.200000000 0.142857143 0
49 0.00000000 0.20000000 0.00000000 0.000000000 0.000000000 0.000000000 0.600000000 0
50 0.00000000 0.14285714 0.00000000 0.000000000 0.000000000 0.000000000 0.428571429 0
51 0.00000000 0.00000000 0.00000000 0.409090909 0.363636364 0.045454545 0.090909091 0
52 0.01935484 0.03225806 0.07096774 0.367741935 0.000000000 0.000000000 0.503225806 0
53 0.00000000 0.00000000 0.20833333 0.208333333 0.041666667 0.083333333 0.375000000 0
54 0.03333333 0.03333333 0.10000000 0.233333333 0.000000000 0.233333333 0.233333333 0
55 0.05263158 0.05263158 0.10526316 0.210526316 0.000000000 0.000000000 0.526315789 0
56 0.00000000 0.02564103 0.10256410 0.307692308 0.000000000 0.051282051 0.435897436 0
57 0.09523810 0.04761905 0.00000000 0.190476190 0.000000000 0.142857143 0.285714286 0
58 0.00000000 0.00000000 0.00000000 0.111111111 0.000000000 0.000000000 0.333333333 0
59 0.00000000 0.00000000 0.00000000 0.181818182 0.000000000 0.000000000 0.363636364 0
60 0.00000000 0.00000000 0.00000000 0.000000000 0.000000000 0.000000000 0.285714286 0
61 0.00000000 0.00000000 0.00000000 0.285714286 0.000000000 0.000000000 0.571428571 0
62 0.01538462 0.03076923 0.35384615 0.076923077 0.000000000 0.000000000 0.507692308 0
63 0.01694915 0.06779661 0.06779661 0.322033898 0.000000000 0.016949153 0.491525424 0
64 0.06666667 0.20000000 0.18974359 0.035897436 0.000000000 0.005128205 0.497435897 0
65 0.01023018 0.01790281 0.02557545 0.442455243 0.002557545 0.496163683 0.002557545 0
66 0.00000000 0.04255319 0.08510638 0.340425532 0.021276596 0.000000000 0.489361702 0
sm sl modelsize class subclass port proto
1 0.000000000 0.501026694 487 Normal-TCP-HTTP normal 80 tcp
2 0.000000000 0.538461538 13 Normal-TCP-HTTP normal 443 tcp
3 0.132075472 0.377358491 53 Normal-TCP-HTTP normal 80 tcp
4 0.062500000 0.437500000 16 Normal-TCP-HTTP normal 80 tcp
5 0.495472186 0.005174644 773 Normal-TCP-HTTP normal 80 tcp
6 0.000000000 0.500410846 1217 Normal-TCP-HTTP normal 80 tcp
7 0.478152310 0.022471910 801 Normal-TCP-HTTP normal 80 tcp
8 0.047619048 0.476190476 21 Normal-TCP-HTTP normal 443 tcp
9 0.076923077 0.461538462 13 Normal-TCP-HTTP normal 443 tcp
10 0.000000000 0.538461538 13 Normal-TCP-HTTP normal 443 tcp
11 0.307692308 0.153846154 13 Normal-TCP-HTTP normal 80 tcp
12 0.491761724 0.008871990 789 Normal-TCP-HTTP normal 80 tcp
13 0.000000000 0.509803922 51 Normal-TCP-HTTP normal 443 tcp
14 0.000000000 0.509433962 53 Normal-TCP-HTTP normal 443 tcp
15 0.000000000 0.509433962 53 Normal-TCP-HTTP normal 443 tcp
16 0.497902098 0.002797203 715 Normal-TCP-HTTP normal 80 tcp
17 0.000000000 0.500952381 525 Normal-TCP-HTTP normal 443 tcp
18 0.000000000 0.500904159 553 Normal-TCP-HTTP normal 443 tcp
19 0.494396015 0.006226650 803 Normal-TCP-HTTP normal 80 tcp
20 0.000000000 0.507462687 67 Normal-TCP-HTTP normal 443 tcp
21 0.000000000 0.507246377 69 Normal-TCP-HTTP normal 443 tcp
22 0.486291486 0.014430014 693 Normal-TCP-HTTP normal 80 tcp
23 0.000000000 0.506493506 77 Normal-TCP-HTTP normal 443 tcp
24 0.360655738 0.131147541 61 Normal-TCP-HTTP normal 80 tcp
25 0.466666667 0.033333333 60 Normal-TCP-HTTP normal 80 tcp
26 0.173913043 0.304347826 23 Normal-TCP-HTTP normal 80 tcp
27 0.000000000 0.508196721 61 Normal-TCP-HTTP normal 443 tcp
28 0.000000000 0.500000000 14 Normal-TCP-HTTP normal 443 tcp
29 0.000000000 0.466666667 15 Normal-TCP-HTTP normal 80 tcp
30 0.125000000 0.375000000 8 Normal-TCP-HTTP normal 80 tcp
31 0.000000000 0.500000000 24 Normal-TCP-HTTP normal 80 tcp
32 0.176470588 0.323529412 34 Normal-TCP-HTTP normal 80 tcp
33 0.208333333 0.291666667 24 Normal-TCP-HTTP normal 80 tcp
34 0.250000000 0.250000000 8 Normal-TCP-HTTP normal 80 tcp
35 0.038461538 0.461538462 26 Normal-TCP-HTTP normal 80 tcp
36 0.300000000 0.200000000 10 Normal-TCP-HTTP normal 80 tcp
37 0.125000000 0.375000000 8 Normal-TCP-HTTP normal 80 tcp
38 0.125000000 0.375000000 24 Normal-TCP-HTTP normal 80 tcp
39 0.000000000 0.500000000 14 Normal-TCP-HTTP normal 80 tcp
40 0.125000000 0.375000000 32 Normal-TCP-HTTP normal 80 tcp
41 0.125000000 0.375000000 16 Normal-TCP-HTTP normal 80 tcp
42 0.076190476 0.428571429 105 Normal-TCP-HTTP normal 80 tcp
43 0.200000000 0.300000000 10 Normal-TCP-HTTP normal 80 tcp
44 0.250000000 0.250000000 8 Normal-TCP-HTTP normal 80 tcp
45 0.111111111 0.388888889 18 Normal-TCP-HTTP normal 80 tcp
46 0.296296296 0.203703704 54 Normal-TCP-HTTP normal 80 tcp
47 0.187500000 0.250000000 16 Normal-TCP-HTTP normal 80 tcp
48 0.142857143 0.371428571 35 Normal-TCP-HTTP normal 80 tcp
49 0.000000000 0.600000000 5 Normal-TCP-HTTP normal 80 tcp
50 0.000000000 0.428571429 7 Normal-TCP-HTTP normal 80 tcp
51 0.090909091 0.409090909 22 Normal-TCP-HTTP normal 80 tcp
52 0.070967742 0.432258065 155 Normal-TCP-HTTP normal 80 tcp
53 0.000000000 0.500000000 24 Normal-TCP-HTTP normal 80 tcp
54 0.133333333 0.333333333 30 Normal-TCP-HTTP normal 80 tcp
55 0.000000000 0.526315789 19 Normal-TCP-HTTP normal 80 tcp
56 0.333333333 0.153846154 39 Normal-TCP-HTTP normal 80 tcp
57 0.000000000 0.428571429 21 Normal-TCP-HTTP normal 80 tcp
58 0.000000000 0.333333333 9 Normal-TCP-HTTP normal 80 tcp
59 0.181818182 0.181818182 11 Normal-TCP-HTTP normal 80 tcp
60 0.000000000 0.285714286 7 Normal-TCP-HTTP normal 80 tcp
61 0.000000000 0.571428571 7 Normal-TCP-HTTP normal 80 tcp
62 0.000000000 0.507692308 65 Normal-TCP-HTTP normal 443 tcp
63 0.338983051 0.169491525 59 Normal-TCP-HTTP normal 80 tcp
64 0.025641026 0.476923077 195 Normal-TCP-HTTP normal 80 tcp
65 0.002557545 0.498721228 391 Normal-TCP-HTTP normal 80 tcp
66 0.319148936 0.191489362 47 Normal-TCP-HTTP normal 80 tcp
[ reached 'max' / getOption("max.print") -- omitted 2536 rows ]
# Everything except the two classes that were split out separately.
feature_vectors_cleaned_aux_rest <- feature_vectors_cleaned %>%
  filter(
    class != "Botnet-TCP-SMTP-Attempt-SPAM",
    class != "Normal-TCP-HTTP"
  )

# Rows per remaining class, most frequent first.
feature_vectors_cleaned_aux_rest %>%
  group_by(class) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
# A tibble: 45 x 2
class n
<fct> <int>
1 Botnet-TCP-HTTP-Established-Ad 487
2 Botnet-TCP-HTTP-Established 377
3 Botnet-UDP-DNS 301
4 Botnet-TCP-HTTP-Established-SSL 210
5 Botnet-TCP-HTTP-CC-Not-Encrypted 68
6 Normal-UDP-NTP-server 67
7 Botnet-TCP-HTTPS-Established-Microsoft 58
8 Botnet-TCP-HTTP-Google-Net-Established 50
9 Botnet-TCP-SMTP-Established-SPAM 49
10 Botnet-TCP-HTTP-CC-Plain-Encrypted-Data 38
# ... with 35 more rows
aux1 <- rbind(feature_vectors_cleaned_aux_botnet[1:500,],feature_vectors_cleaned_aux_normal[1:500,])
aux1 %>% group_by(class) %>% summarise(n=n()) %>% arrange(desc(n))
# A tibble: 2 x 2
class n
<fct> <int>
1 Botnet-TCP-SMTP-Attempt-SPAM 500
2 Normal-TCP-HTTP 500
aux <- rbind(feature_vectors_cleaned_aux_rest,aux1)
aux %>% group_by(class) %>% summarise(n=n()) %>% arrange(desc(n))
# A tibble: 47 x 2
class n
<fct> <int>
1 Botnet-TCP-SMTP-Attempt-SPAM 500
2 Normal-TCP-HTTP 500
3 Botnet-TCP-HTTP-Established-Ad 487
4 Botnet-TCP-HTTP-Established 377
5 Botnet-UDP-DNS 301
6 Botnet-TCP-HTTP-Established-SSL 210
7 Botnet-TCP-HTTP-CC-Not-Encrypted 68
8 Normal-UDP-NTP-server 67
9 Botnet-TCP-HTTPS-Established-Microsoft 58
10 Botnet-TCP-HTTP-Google-Net-Established 50
# ... with 37 more rows
feature_vectors_cleaned <- aux
# Fix the RNG so the train/test split and the up-sampling below are reproducible.
set.seed(212)
# 70/30 split driven by the `subclass` labels.
trainIndex <- createDataPartition(feature_vectors_cleaned$subclass, p=0.70, list=FALSE)
data_training <- feature_vectors_cleaned[ trainIndex,]
data_testing <- feature_vectors_cleaned[-trainIndex,]
#data_train = data_train %>% filter(length>5)
# Up-sample the training rows with `subclass` as the outcome; upSample()
# appends the outcome as an extra column named by `yname` ("class" here).
train <- upSample(x = data_training, y = data_training$subclass, yname="class")
# Positional column drops.
# NOTE(review): column 11 is presumably `length` and column 16 the outcome
# column appended by upSample() - confirm against names(train).
training <- train[,-c(11,16)]
testing <- data_testing[,-c(11)]
training
testing
nrow(training)
[1] 3322
nrow(feature_vectors_cleaned)
[1] 3089
# Resampling configuration shared by the random-forest fits below:
# 10-fold cross-validation scored with twoClassSummary, which requires class
# probabilities (classProbs = TRUE).
# `repeats` is deliberately not set: it only applies to
# method = "repeatedcv"; with plain "cv" it has no effect and newer caret
# versions warn about it.
ctrl_fast <- trainControl(
  method = "cv",
  number = 10,
  summaryFunction = twoClassSummary,
  verboseIter = TRUE,
  classProbs = TRUE,
  allowParallel = TRUE
)
library(factoextra)
library(cluster)
library(NbClust)
feature_vector_training = training[,-c(11,12,13,14)]
# K-means clustering
set.seed(321)
#km.res <- kmeans(feature_vector_training, 3, nstart = 25)
km.res <- kmeans(feature_vector_training, 7, nstart = 25)
# k-means group number of each observation
km.res$cluster
[1] 2 1 2 1 1 5 5 4 2 5 5 2 3 3 3 2 1 2 3 2 2 1 5 3 2 1 3 3 3 3 3 6 3 4 2 2 1 5 2 1 7 3 2 1 1 2 1 1 1 1 3 1 3 1 1 1 3 2 2 2 2 2 2 6 2 2 2 2 6 6 5 6 4 4 2 2
[77] 2 4 4 2 2 2 2 3 6 4 4 2 2 4 4 6 6 2 2 3 4 2 2 2 4 2 2 6 4 3 2 5 2 2 6 2 2 2 2 4 4 2 2 2 5 7 3 6 3 5 6 2 2 3 4 2 6 5 2 4 4 3 2 6 4 4 4 4 3 5 2 4 4 4 4 4
[153] 4 6 6 4 6 3 4 5 6 4 6 3 4 4 6 4 4 4 6 2 4 4 3 3 4 6 6 5 4 6 3 4 5 6 5 4 3 4 4 4 4 4 6 6 6 6 6 6 3 3 5 4 4 1 2 2 4 2 4 6 5 4 4 2 7 6 6 6 3 6 6 7 3 4 5 6
[229] 7 4 5 2 6 5 3 1 1 1 6 4 6 4 6 6 5 6 6 6 6 6 6 6 7 5 7 5 1 3 2 1 2 1 2 1 1 1 1 2 2 3 2 5 5 5 1 7 7 7 7 1 3 1 1 2 3 2 2 3 3 1 1 1 1 2 2 1 3 1 1 2 2 3 4 3
[305] 1 1 1 2 2 3 1 2 1 3 1 2 1 2 1 2 3 2 2 2 2 2 1 1 2 5 5 1 1 5 1 1 1 1 1 1 1 1 1 1 1 6 1 1 6 2 1 1 1 5 2 1 2 5 2 2 1 4 1 6 2 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[381] 1 1 1 1 1 1 1 1 1 3 2 2 1 1 3 2 1 2 2 2 2 5 1 3 5 3 2 1 2 1 2 2 1 1 1 1 3 1 2 1 5 2 2 1 1 1 3 3 1 1 2 2 2 1 3 2 2 1 1 3 1 2 6 2 6 5 5 5 5 2 2 2 2 2 2 3
[457] 2 2 2 2 2 2 2 6 1 6 6 6 3 3 6 6 3 6 6 2 1 1 2 1 2 6 4 1 3 2 2 2 3 1 3 2 3 1 1 1 1 1 1 2 2 1 2 1 2 3 1 1 6 2 3 1 1 1 7 1 7 2 3 1 3 3 3 2 2 3 2 1 1 1 1 3
[533] 1 2 1 2 5 7 4 3 3 3 1 1 2 2 1 2 3 3 2 2 2 1 1 3 1 5 1 3 2 1 1 1 5 1 2 2 3 2 2 2 3 3 3 3 1 2 1 2 2 2 3 3 2 3 1 3 1 1 1 2 1 1 1 1 3 3 3 2 2 1 2 2 2 2 1 1
[609] 1 6 2 1 1 2 1 1 1 3 1 1 2 2 3 2 1 1 1 1 3 3 1 1 4 2 2 2 1 2 3 2 2 1 5 2 2 1 2 2 1 1 1 2 3 2 2 3 3 3 3 1 1 1 1 1 2 2 2 3 1 1 1 2 2 3 2 1 1 1 3 1 1 4 1 2
[685] 3 3 2 1 3 3 1 1 1 2 3 5 1 1 1 5 2 2 2 3 1 3 3 1 2 1 1 2 2 2 3 2 1 3 3 1 7 2 3 1 3 2 7 1 2 1 1 2 1 1 1 2 2 3 1 3 1 1 2 1 1 1 3 1 3 2 1 1 1 3 2 1 3 2 4 1
[761] 1 3 1 2 3 2 1 1 1 1 1 2 1 4 3 3 2 1 1 2 3 2 3 1 1 2 3 2 2 2 2 3 7 2 3 1 3 6 3 1 2 2 2 3 1 1 1 3 1 1 1 1 1 1 1 2 3 2 1 4 1 1 1 3 1 2 1 4 3 3 1 6 2 3 1 1
[837] 2 1 2 1 1 1 2 1 1 1 1 1 4 1 2 1 7 1 1 1 1 1 2 2 1 1 1 2 1 1 2 2 3 1 1 2 1 2 2 1 1 2 3 3 3 2 3 3 1 1 1 1 2 1 1 2 2 2 1 1 3 1 1 1 1 2 1 3 3 2 2 2 2 1 2 1
[913] 3 1 5 2 1 1 1 1 2 2 4 2 1 1 3 2 2 2 1 3 2 1 1 1 3 3 2 3 3 1 1 1 2 1 1 1 1 3 3 1 3 3 2 1 1 1 1 1 2 2 1 2 1 3 2 1 2 2 3 2 2 2 5 3 2 2 2 5 1 1 1 1 1 2 2 1
[989] 1 1 1 3 1 1 1 3 1 2 1 3
[ reached getOption("max.print") -- omitted 2322 entries ]
# Visualize k-means clusters
fviz_cluster(km.res, data = feature_vector_training, geom = "point",
stand = FALSE, ellipse.type = "norm")
# Elbow method: total within-cluster sum of squares for k = 1 .. k.max.
set.seed(321)
k.max <- 15 # Maximal number of clusters
data <- feature_vector_training
# One k-means fit (10 random starts) per candidate k; returns its total WSS.
total_withinss <- function(k) {
  kmeans(data, centers = k, nstart = 10)$tot.withinss
}
wss <- vapply(seq_len(k.max), total_withinss, numeric(1))
plot(
  seq_len(k.max), wss,
  type = "b", pch = 19, frame = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)
# Reference line at the k chosen by eye.
abline(v = 3, lty = 2)
set.seed(322)
k.max <- 10
data <- feature_vector_training
nrow(data)
[1] 3322
# Average silhouette width for k = 2 .. k.max.
# sil[1] stays 0: the silhouette is not defined for a single cluster.
sil <- rep(0, k.max)
# The pairwise distance matrix does not depend on k, so compute it once
# instead of once per iteration.
data_dist <- dist(data)
for (i in 2:k.max) {
  km.res <- kmeans(data, centers = i, nstart = 25)
  ss <- silhouette(km.res$cluster, data_dist)
  # Column 3 of a silhouette object holds the per-observation widths.
  sil[i] <- mean(ss[, 3])
}
# Plot the average silhouette width and mark the best k
plot(1:k.max, sil, type = "b", pch = 19,
     frame = FALSE, xlab = "Number of clusters k")
abline(v = which.max(sil), lty = 2)
# Learning-curve ("cold start") experiment for a cluster-wise random-forest
# ensemble.
#
# For every prefix of `training.sampled` whose size is a multiple of 200 rows,
# the prefix is split into 3 k-means clusters, one random forest is trained
# per cluster, and the three forests vote (+1 botnet / -1 normal) on every row
# of `testing` and of the prefix itself. The F1 score of the majority vote is
# recorded for both sets.
#
# Args:
#   training.sampled: training data frame; rows are consumed in order, so the
#     caller controls the sampling order. Needs the ten feature columns named
#     in the formula and a `subclass` column with a "botnet" level. Columns
#     11-14 are excluded from clustering
#     (NOTE(review): assumed to be the non-feature columns - confirm).
#   testing: held-out data frame with the same features and `subclass`.
#   settings: caret trainControl object handed to train().
#
# Returns (invisibly, as the original trailing assignment did) a list with:
#   output:         data frame (data_count, metric), F1 on `testing`.
#   output_t:       data frame (data_count, metric), F1 on the prefix.
#   testing_result: data frame holding one placeholder column of zeros plus
#                   one `result` column of summed votes per prefix size.
cold_start_data <- function(training.sampled, testing, settings) {
  library(doParallel)
  cl <- makeCluster(2)
  registerDoParallel(cl)
  # Release the worker processes on exit (they used to leak on every call) and
  # leave a usable sequential backend registered, because the stopped cluster
  # can no longer serve %dopar%. Callers wanting a parallel backend afterwards
  # must register one again.
  on.exit({
    parallel::stopCluster(cl)
    foreach::registerDoSEQ()
  }, add = TRUE)

  step_size <- 200
  n_clusters <- 3
  # Whole steps only; seq_len() also copes with fewer than `step_size` rows.
  n_steps <- nrow(training.sampled) %/% step_size
  data_count <- step_size * seq_len(n_steps)

  testing_result <- data.frame(numeric(nrow(testing)))
  metric <- numeric(n_steps)
  metric_t <- numeric(n_steps)
  # confusionMatrix() needs factors sharing one set of levels.
  testing_reference <- as.factor(testing$subclass)

  for (i in seq_len(n_steps)) {
    aux_training_set <- training.sampled[seq_len(data_count[i]), ]
    training_reference <- as.factor(aux_training_set$subclass)
    clusters <- kmeans(aux_training_set[, -c(11, 12, 13, 14)], n_clusters,
                       nstart = 25)
    aux_training_set_cluster <- cbind(aux_training_set,
                                      cluster = clusters$cluster)
    result_vector <- numeric(nrow(testing))
    result_vector_trainning <- numeric(nrow(aux_training_set))

    for (j in seq_len(n_clusters)) {
      cluster_data <- dplyr::filter(aux_training_set_cluster, cluster == j)
      new_rfFit <- train(subclass ~ sp + wp + wnp + snp + ds + dm + dl + ss + sm + sl,
                         data = cluster_data,
                         metric = "ROC",
                         method = "rf",
                         trControl = settings)
      # Each forest casts +1 (botnet) or -1 (normal) per row.
      predsrfprobs <- predict(new_rfFit, testing, type = "prob")
      result_vector <- result_vector +
        ifelse(predsrfprobs$botnet > 0.5, 1, -1)
      predsrfprobs_t <- predict(new_rfFit, aux_training_set, type = "prob")
      result_vector_trainning <- result_vector_trainning +
        ifelse(predsrfprobs_t$botnet > 0.5, 1, -1)
    }

    # Majority vote; with three voters the sum is never zero.
    testing_pred <- factor(ifelse(result_vector > 0, "botnet", "normal"),
                           levels = levels(testing_reference))
    training_pred <- factor(ifelse(result_vector_trainning > 0, "botnet", "normal"),
                            levels = levels(training_reference))
    testing_result <- cbind(testing_result, "result" = result_vector)

    cm <- confusionMatrix(testing_pred, testing_reference)
    metric[i] <- cm$byClass["F1"]
    cm_t <- confusionMatrix(training_pred, training_reference)
    metric_t[i] <- cm_t$byClass["F1"]
  }

  output <- data.frame(data_count = data_count, metric = metric)
  output_t <- data.frame(data_count = data_count, metric = metric_t)
  invisible(list("output" = output, "output_t" = output_t,
                 "testing_result" = testing_result))
}
# Learning-curve ("cold start") baseline using a single random forest.
#
# For every prefix of `training.sampled` whose size is a multiple of 200 rows,
# one random forest is trained on the prefix and its F1 score is recorded on
# `testing` and on the prefix itself (botnet when P(botnet) >= 0.5).
#
# Args:
#   training.sampled: training data frame; rows are consumed in order. Needs
#     the ten feature columns named in the formula and a `subclass` column
#     with a "botnet" level.
#   testing: held-out data frame with the same features and `subclass`.
#   settings: caret trainControl object handed to train().
#
# Returns (invisibly, as the original trailing assignment did) a list with:
#   output:         data frame (data_count, metric), F1 on `testing`.
#   output_t:       data frame (data_count, metric), F1 on the prefix.
#   testing_result: one-column data frame of zeros; never filled in here and
#                   kept only so the list has the same shape as the one
#                   cold_start_data() returns.
cold_start_data_only_rf <- function(training.sampled, testing, settings) {
  step_size <- 200
  # Whole steps only; seq_len() also copes with fewer than `step_size` rows.
  n_steps <- nrow(training.sampled) %/% step_size
  data_count <- step_size * seq_len(n_steps)

  testing_result <- data.frame(numeric(nrow(testing)))
  metric <- numeric(n_steps)
  metric_t <- numeric(n_steps)
  # confusionMatrix() needs factors sharing one set of levels.
  testing_reference <- as.factor(testing$subclass)

  for (i in seq_len(n_steps)) {
    aux_training_set <- training.sampled[seq_len(data_count[i]), ]
    training_reference <- as.factor(aux_training_set$subclass)
    new_rfFit <- train(subclass ~ sp + wp + wnp + snp + ds + dm + dl + ss + sm + sl,
                       data = aux_training_set,
                       metric = "ROC",
                       method = "rf",
                       trControl = settings)

    # Held-out performance
    predsrfprobs <- predict(new_rfFit, testing, type = "prob")
    predsrf <- factor(ifelse(predsrfprobs$botnet >= 0.5, "botnet", "normal"),
                      levels = levels(testing_reference))
    cm <- confusionMatrix(predsrf, testing_reference)
    metric[i] <- cm$byClass["F1"]

    # Performance on the rows the model was trained on
    predsrfprobs_t <- predict(new_rfFit, aux_training_set, type = "prob")
    predsrf_t <- factor(ifelse(predsrfprobs_t$botnet >= 0.5, "botnet", "normal"),
                        levels = levels(training_reference))
    cm_t <- confusionMatrix(predsrf_t, training_reference)
    metric_t[i] <- cm_t$byClass["F1"]
  }

  output <- data.frame(data_count = data_count, metric = metric)
  output_t <- data.frame(data_count = data_count, metric = metric_t)
  invisible(list("output" = output, "output_t" = output_t,
                 "testing_result" = testing_result))
}
# Inject label noise: pick `porcent` random rows and swap their `class` label
# between "Botnet" and "Normal", then return all rows in shuffled order.
#
# Args:
#   dataset: data frame with a `class` column (factor or character).
#   porcent: number of rows to corrupt. Despite the name it is a row count,
#     not a percentage; it is passed as `size` to sample().
#
# Returns: a data frame with the same rows and columns as `dataset`, in random
#   order, where the sampled "Botnet"/"Normal" rows carry the opposite label.
#
# Every input row is kept. Sampled rows with any other (or missing) label stay
# unchanged, and sampling zero rows returns the full data set; both cases used
# to drop rows, the latter through `dataset[-integer(0), ]`.
generate_data_noisy <- function(dataset, porcent) {
  stopifnot("class" %in% names(dataset))
  n_rows <- nrow(dataset)
  flip_idx <- sample(n_rows, porcent)

  labels <- as.character(dataset$class)
  # %in% never yields NA, so rows with a missing label are simply not flipped.
  to_normal <- flip_idx[labels[flip_idx] %in% "Botnet"]
  to_botnet <- flip_idx[labels[flip_idx] %in% "Normal"]
  labels[to_normal] <- "Normal"
  labels[to_botnet] <- "Botnet"

  if (is.factor(dataset$class)) {
    # Keep the original level order; add a level only if a flip introduced it.
    all_levels <- union(levels(dataset$class), unique(labels[!is.na(labels)]))
    dataset$class <- factor(labels, levels = all_levels)
  } else {
    dataset$class <- labels
  }

  # Shuffle so the corrupted rows are not grouped together.
  dataset[sample(n_rows, n_rows), , drop = FALSE]
}
# Loss-of-accuracy score combining two terms, both relative to A0:
#   (A0 - Ax) / A0  - the accuracy lost when going from A0 to Ax, and
#   (100 - A0) / A0 - how far A0 itself is from 100.
# NOTE(review): the constant 100 suggests A0 and Ax are accuracies expressed
# as percentages (A0 without noise, Ax with noise) - confirm with the callers.
# Works element-wise on vectors.
get_ELA_measure <- function(A0, Ax) {
  relative_loss <- (A0 - Ax) / A0
  baseline_gap <- (100 - A0) / A0
  relative_loss + baseline_gap
}
# Train a random forest on `training_data` and report its per-class metrics
# on `testing_data`.
#
# Args:
#   training_data: data frame with the ten feature columns named in the
#     formula and a `class` column that has a "Botnet" level.
#   testing_data: data frame with the same columns.
#   settings: caret trainControl object. When NULL (the default) the function
#     falls back on the `settings` object visible from where it was defined,
#     which is what the original body read as a free variable.
#
# Returns: the `byClass` vector of caret::confusionMatrix() for predictions
#   thresholded at P(Botnet) >= 0.5.
randomForest_performace <- function(training_data, testing_data, settings = NULL) {
  if (is.null(settings)) {
    settings <- get("settings", envir = parent.env(environment()))
  }
  rfFit <- train(class ~ sp + wp + wnp + snp + ds + dm + dl + ss + sm + sl,
                 data = training_data,
                 metric = "ROC",
                 method = "rf",
                 trControl = settings)
  predsrfprobs <- predict(rfFit, testing_data, type = "prob")
  # confusionMatrix() needs factors sharing one set of levels; a bare
  # character vector is rejected by current caret versions.
  reference <- as.factor(testing_data$class)
  predsrf <- factor(ifelse(predsrfprobs$Botnet >= 0.5, "Botnet", "Normal"),
                    levels = levels(reference))
  cm <- confusionMatrix(predsrf, reference)
  cm$byClass
}
training
testing
output_1 <- result$output
output_t_1 <- result$output_t
output_1
gg <- ggplot(data = output_1)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
first_training_sample <- training.sampled_1[1:200,]
ggplot(first_training_sample) + geom_bar(aes(subclass))
class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
testing_result <- result$testing_result
#testing_result
gg <- ggplot(data = output_t_1)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
output_t_aux_1 <- output_t_1
names(output_t_aux_1) <- c('data_count_t','metric_t')
output_result_1 <- cbind(output_1,output_t_aux_1)
gg <- ggplot(data = output_result_1)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
output_2 <- result_2$output
output_t_2 <- result_2$output_t
output_2
gg <- ggplot(data = output_2)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
first_training_sample <- training.sampled_2[1:200,]
ggplot(first_training_sample) + geom_bar(aes(subclass))
class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Vote matrix of the second run; it used to be read from `result` (the first
# run), a copy/paste slip.
testing_result_2 <- result_2$testing_result
#testing_result
gg <- ggplot(data = output_t_2)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
output_t_aux_2 <- output_t_2
names(output_t_aux_2) <- c('data_count_t','metric_t')
output_result_2 <- cbind(output_2,output_t_aux_2)
gg <- ggplot(data = output_result_2)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
output_3 <- result_3$output
output_t_3 <- result_3$output_t
output_3
gg <- ggplot(data = output_3)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
first_training_sample <- training.sampled_3[1:200,]
ggplot(first_training_sample) + geom_bar(aes(subclass))
class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Vote matrix of the third run; it used to be read from `result` (the first
# run), a copy/paste slip.
testing_result_3 <- result_3$testing_result
#testing_result
gg <- ggplot(data = output_t_3)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
output_t_aux_3 <- output_t_3
names(output_t_aux_3) <- c('data_count_t','metric_t')
output_result_3 <- cbind(output_3,output_t_aux_3)
gg <- ggplot(data = output_result_3)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
output_4 <- result_4$output
output_t_4 <- result_4$output_t
output_4
gg <- ggplot(data = output_4)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
first_training_sample <- training.sampled_4[1:200,]
ggplot(first_training_sample) + geom_bar(aes(subclass))
class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Vote matrix of the fourth run; it used to be read from `result` (the first
# run), a copy/paste slip.
testing_result_4 <- result_4$testing_result
#testing_result
gg <- ggplot(data = output_t_4)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
output_t_aux_4 <- output_t_4
names(output_t_aux_4) <- c('data_count_t','metric_t')
output_result_4 <- cbind(output_4,output_t_aux_4)
gg <- ggplot(data = output_result_4)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
output_5 <- result_5$output
output_t_5 <- result_5$output_t
output_5
gg <- ggplot(data = output_5)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
first_training_sample <- training.sampled_5[1:200,]
ggplot(first_training_sample) + geom_bar(aes(subclass))
class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Vote matrix of the fifth run; it used to be read from `result` (the first
# run), a copy/paste slip.
testing_result_5 <- result_5$testing_result
#testing_result
gg <- ggplot(data = output_t_5)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
output_t_aux_5 <- output_t_5
names(output_t_aux_5) <- c('data_count_t','metric_t')
output_result_5 <- cbind(output_5,output_t_aux_5)
gg <- ggplot(data = output_result_5)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
rf_output_1 <- rf_result_1$output
rf_output_t_1 <- rf_result_1$output_t
rf_output_1
gg <- ggplot(data = rf_output_1)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
rf_first_training_sample <- rf_training.sampled_1[1:200,]
ggplot(rf_first_training_sample) + geom_bar(aes(subclass))
rf_class_distribution <- rf_first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(rf_first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
#rf_testing_result_1 <- rf_result_1$testing_result
#testing_result
gg <- ggplot(data = rf_output_t_1)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
rf_output_t_aux_1 <- rf_output_t_1
names(rf_output_t_aux_1) <- c('data_count_t','metric_t')
rf_output_result_1 <- cbind(rf_output_1,rf_output_t_aux_1)
gg <- ggplot(data = rf_output_result_1)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
rf_output_2 <- rf_result_2$output
rf_output_t_2 <- rf_result_2$output_t
rf_output_2
gg <- ggplot(data = rf_output_2)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
rf_first_training_sample <- rf_training.sampled_2[1:200,]
ggplot(rf_first_training_sample) + geom_bar(aes(subclass))
rf_class_distribution <- rf_first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(rf_first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
#rf_testing_result_2 <- rf_result_2$testing_result
#testing_result
gg <- ggplot(data = rf_output_t_2)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
rf_output_t_aux_2 <- rf_output_t_2
names(rf_output_t_aux_2) <- c('data_count_t','metric_t')
rf_output_result_2 <- cbind(rf_output_2,rf_output_t_aux_2)
gg <- ggplot(data = rf_output_result_2)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
rf_output_3 <- rf_result_3$output
rf_output_t_3 <- rf_result_3$output_t
rf_output_3
gg <- ggplot(data = rf_output_3)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
rf_first_training_sample <- rf_training.sampled_3[1:200,]
ggplot(rf_first_training_sample) + geom_bar(aes(subclass))
rf_class_distribution <- rf_first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(rf_first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
#rf_testing_result_2 <- rf_result_2$testing_result
#testing_result
gg <- ggplot(data = rf_output_t_3)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
rf_output_t_aux_3 <- rf_output_t_3
names(rf_output_t_aux_3) <- c('data_count_t','metric_t')
rf_output_result_3 <- cbind(rf_output_3,rf_output_t_aux_3)
gg <- ggplot(data = rf_output_result_3)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
rf_output_4 <- rf_result_4$output
rf_output_t_4 <- rf_result_4$output_t
rf_output_4
gg <- ggplot(data = rf_output_4)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
rf_first_training_sample <- rf_training.sampled_4[1:200,]
ggplot(rf_first_training_sample) + geom_bar(aes(subclass))
rf_class_distribution <- rf_first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(rf_first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
#rf_testing_result_2 <- rf_result_2$testing_result
#testing_result
gg <- ggplot(data = rf_output_t_4)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
rf_output_t_aux_4 <- rf_output_t_4
names(rf_output_t_aux_4) <- c('data_count_t','metric_t')
rf_output_result_4 <- cbind(rf_output_4,rf_output_t_aux_4)
gg <- ggplot(data = rf_output_result_4)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
rf_output_5 <- rf_result_5$output
rf_output_t_5 <- rf_result_5$output_t
rf_output_5
gg <- ggplot(data = rf_output_5)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
rf_first_training_sample <- rf_training.sampled_5[1:200,]
ggplot(rf_first_training_sample) + geom_bar(aes(subclass))
rf_class_distribution <- rf_first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(rf_first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
#rf_testing_result_2 <- rf_result_2$testing_result
#testing_result
gg <- ggplot(data = rf_output_t_5)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
rf_output_t_aux_5 <- rf_output_t_5
names(rf_output_t_aux_5) <- c('data_count_t','metric_t')
rf_output_result_5 <- cbind(rf_output_5,rf_output_t_aux_5)
gg <- ggplot(data = rf_output_result_5)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') +
labs(title="Random Forest through data training size",
#subtitle="Drawn from Long Data format",
caption="Source: CTU-13",
y="F1 Score",
color=NULL)
first_training_sample <- training.sampled[1:200,]
first_training_sample
ggplot(first_training_sample) + geom_bar(aes(subclass))
class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Single-step walkthrough of the clustered random-forest ensemble on the first
# 200 shuffled training rows: 3 k-means clusters, one forest per cluster, and
# a +1 (botnet) / -1 (normal) vote per forest on every row.
set.seed(226)
size_training <- nrow(training)
training.sampled <- training[sample(size_training, size_training), ]
aux_training_set <- training.sampled[seq_len(200), ]
clusters <- kmeans(aux_training_set[, -c(11, 12, 13, 14)], 3, nstart = 25)
aux_training_set_cluster <- cbind(aux_training_set, cluster = clusters$cluster)
result_vector <- numeric(nrow(testing))
result_vector_trainning <- numeric(nrow(aux_training_set))
for (j in seq_len(3)) {
  cluster_data <- aux_training_set_cluster %>% filter(cluster == j)
  new_rfFit <- train(subclass ~ sp + wp + wnp + snp + ds + dm + dl + ss + sm + sl,
                     data = cluster_data,
                     metric = "ROC",
                     method = "rf",
                     trControl = ctrl_fast)
  # Votes on the held-out rows
  predsrfprobs <- predict(new_rfFit, testing, type = "prob")
  result_vector <- result_vector + ifelse(predsrfprobs$botnet > 0.5, 1, -1)
  # Votes on the training rows
  predsrfprobs_t <- predict(new_rfFit, aux_training_set, type = "prob")
  result_vector_trainning <- result_vector_trainning +
    ifelse(predsrfprobs_t$botnet > 0.5, 1, -1)
}
# confusionMatrix() needs factors sharing one set of levels; a bare character
# vector is rejected by current caret versions.
testing_reference <- as.factor(testing$subclass)
training_reference <- as.factor(aux_training_set$subclass)
# Majority vote; with three voters the sum is never zero.
a <- factor(ifelse(result_vector > 0, "botnet", "normal"),
            levels = levels(testing_reference))
b <- factor(ifelse(result_vector_trainning > 0, "botnet", "normal"),
            levels = levels(training_reference))
cm <- confusionMatrix(a, testing_reference)
metric <- cm$byClass["F1"]
metric
cm_t <- confusionMatrix(b, training_reference)
metric_t <- cm_t$byClass["F1"]
metric_t
set.seed(556)
a = c(1,2,3,4,5,6,7,8,9)
r <- sample(9,3)
a[r]
r2 <- sample(9,3)
a[r2]
#testing_result
testing_result.bkp <- testing_result
testing_result
# Label each vote column of `testing_result` with the training-prefix size
# that produced it (size_200, size_400, ...). Only whole steps of 200 rows
# exist, hence the integer division.
n_steps <- nrow(training) %/% 200
testing_result_names <- paste0("size_", 200 * seq_len(n_steps))
# Drop the leading placeholder column of zeros; drop = FALSE keeps a data
# frame even when a single vote column remains.
testing_result <- testing_result[, -1, drop = FALSE]
names(testing_result) <- testing_result_names
testing_result
testing_aux <- cbind(testing,testing_result)
testing_aux.bkp2 <- testing_aux
#write.table(testing_aux,file="testing_cluster_result.txt",sep="|", row.names = F)
testing_aux
sums <- rowSums(testing_aux[,-c(1:14)])
sums
testing_aux[,-c(1:14)]
testing_aux <- cbind(testing_aux,sums)
testing_aux
testing_aux_result <- testing_aux %>% group_by(class) %>% summarise(n = n(), sums = sum(sums)) %>% arrange(desc(sums))
testing_aux_result
graph_testing_result <- ggplot(testing_aux_result[-c(1,nrow(testing_aux_result)),])
graph_testing_result + geom_point(aes(class,sums)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
feature_vectors_cleaned
library(gridExtra)
pdf("data_output.pdf", height=11, width=8.5)
grid.table(feature_vectors_cleaned[1:20,])
dev.off()
testing_result.bkp
testing_aux.bkp2
testing_aux_result
# Per-row tally of the ensemble votes across all training-prefix sizes,
# summarised per port.
rusty_data_result <- testing_aux.bkp2
# Keep the two leading identifier columns (the code below uses `subclass` and
# `port` by name) plus the per-size vote columns.
rusty_data_result_short <- rusty_data_result[, -c(1:11, 14)]
id_columns <- names(rusty_data_result_short)[1:2]
vote_columns <- rusty_data_result_short[, -c(1, 2), drop = FALSE]
vote_columns
# A positive sum is a "botnet" majority for that prefix size, negative "normal".
rusty_data_result_short$pos <- rowSums(vote_columns > 0)
rusty_data_result_short$neg <- rowSums(vote_columns < 0)
# Select by name: fixed positions (46, 47) only match one specific number of
# vote columns.
rusty_data_result_short_cleaned <-
  rusty_data_result_short[, c(id_columns, "pos", "neg")]
rusty_data_result_short_cleaned
# good = votes agreeing with the true subclass, bad = votes against it.
rusty_data_result_short_cleaned_result <- rusty_data_result_short_cleaned %>%
  mutate(good = ifelse(subclass == 'normal', neg, pos))
rusty_data_result_short_cleaned_result <- rusty_data_result_short_cleaned_result %>%
  mutate(bad = ifelse(subclass == 'normal', pos, neg))
rusty_data_result_short_cleaned_result %>%
  group_by(port) %>%
  summarise(n = n(), good = sum(good), bad = sum(bad)) %>%
  arrange(desc(n))
data_botnet_port <- rusty_data_result_short_cleaned_result %>% filter(subclass == 'botnet')
data_normal_port <- rusty_data_result_short_cleaned_result %>% filter(subclass == 'normal')
data_botnet_port_result <- data_botnet_port %>% group_by(port) %>% summarise(n=n(),good = sum(good),bad = sum(bad)) %>% arrange(desc(n))
data_normal_port_result <- data_normal_port %>% group_by(port) %>% summarise(n=n(),good = sum(good),bad = sum(bad)) %>% arrange(desc(n))
data_botnet_port_result
data_normal_port_result
# NOTE(review): `clarity` is not among the columns produced by the summarise()
# call that builds data_botnet_port_result (port, n, good, bad), so this looks
# like a leftover from a ggplot2 example and presumably fails when rendered -
# confirm, then drop it or map `fill` to a real column.
ggplot(data = data_botnet_port_result) +
geom_bar(mapping = aes(x = port, fill = clarity))
#write.table(data_botnet_port_result,file="data_botnet_port.txt",sep="|", row.names = F)
library(reshape2)
data <- data_botnet_port_result
data$port <- as.factor(data$port)
melt(data[,c(1,3,4)])
ggplot(melt(data[,c(1,3,4)]))+
geom_col(aes(x=port,y=value,fill=variable))+
#theme_bw()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
set.seed(101)
training.bkp <- training
noisy_data <- training
porcent <- nrow(training) / 5
training_noisy <- generate_data_noisy(noisy_data,porcent)
nrow(training)
nrow(training_noisy)
cm$overall[1]
Accuracy
0.9222042
rf_measures_result
[1] NA 0.92433697 NA 0.92199688 NA 0.92043682 NA 0.91341654
[9] NA 0.91809672 NA 0.91107644 NA 0.91575663 NA 0.91809672
[17] NA 0.90951638 NA 0.91185647 NA 0.91107644 NA 0.90249610
[25] NA 0.90405616 NA 0.90171607 NA 0.87051482 NA 0.88845554
[33] NA 0.86739470 NA 0.84243370 NA 0.85179407 NA 0.78549142
[41] NA 0.72074883 NA 0.74726989 NA 0.66458658 NA 0.58346334
[49] NA 0.48673947 NA 0.40483619 NA 0.30109204 NA 0.22308892
[57] NA 0.24258970 NA 0.18798752 NA 0.14664587 NA 0.13182527
[65] NA 0.12792512 NA 0.12090484 NA 0.12792512 NA 0.11310452
[73] NA 0.09438378 NA 0.09438378 NA 0.09282371 NA 0.08346334
[81] NA 0.09516381 NA 0.09438378 NA 0.08658346 NA 0.08112324
[89] NA 0.07878315
# Plot the measure against the noise level. Only the even positions of
# rf_measures_result hold values (the odd ones print as NA above), hence the
# step of 2.
index <- seq(2,90,2)
measure_result <- rf_measures_result[index]
rla_measure_data <- data.frame(index,measure_result)
names(rla_measure_data) <- c('noise_porcent','balanced_accuracy')
g<-ggplot(rla_measure_data) + geom_point(mapping = aes(x = noise_porcent, y = balanced_accuracy)) + geom_smooth(mapping = aes(x = noise_porcent, y = balanced_accuracy))
# NOTE(review): ggplotly() comes from plotly, whose library() call is
# commented out at the top of this file - confirm it is attached elsewhere.
ggplotly(g)
# Same plot for the ELA measure, indexed the same way.
index <- seq(2,90,2)
measure_result <- ela_measures_result[index]
ela_measure_data <- data.frame(index,measure_result)
names(ela_measure_data) <- c('noise_porcent','ela_measure')
ggplot(ela_measure_data) + geom_point(mapping = aes(x = noise_porcent, y = ela_measure)) + geom_smooth(mapping = aes(x = noise_porcent, y = ela_measure))
result_ela_measure
Error: object 'result_ela_measure' not found
result_ela_measure.bkp
Error: object 'result_ela_measure.bkp' not found
prediction_vector
[1] "0" "0" "0.1428571" "0.1428571" "0" "0" "0.5714286"
[8] "0" "0" "0.5714286" "Normal" "From-Normal-Jist" "80" "tcp"
cs_data.result <- data.frame(rf_result_5$output$data_count)
#for(i in c(1:1)){
current_seed <- 226 #+ i
set.seed(current_seed)
size_training <- nrow(training)
cs_training.sampled_current <- training[sample(size_training, size_training), ]
split_size_training = size_training / 200
metric <- numeric(split_size_training)
for(j in 1:split_size_training){
count <- 200 * j
aux_training_set <- cs_training.sampled_current[c(1:count), ]
result <- c()
for(k in 1:nrow(testing)){
prediction_vector <- testing[k,]
prediction_vector <- as.vector(as.matrix(prediction_vector))
output_result <- prediction_by_similarity(aux_training_set,prediction_vector,101)
result[k] <- output_result
}
vector_result <- unlist(result)
cm <- confusionMatrix(vector_result,testing$class)
metric[j] <- cm$byClass['F1']
}
cs_data.result <- cbind(cs_data.result, metric)
#}
cs_data.result
# Export 30 reshuffled copies of the training set, one CSV per seed
# (227 .. 256), keeping columns 1-10 and 14 only.
for (i in seq_len(30)) {
  current_seed <- 226 + i
  set.seed(current_seed)
  size_training <- nrow(training)
  shuffled_rows <- sample(size_training, size_training)
  rf_training.sampled_current <- training[shuffled_rows, ]
  print_data <- rf_training.sampled_current[, c(1:10, 14)]
  file_name <- paste0("trainin_sample_", i, ".csv")
  write.csv(print_data, file = file_name, row.names = FALSE)
}
#write.csv(testing[,c(1:11)], file='testing.csv', row.names = F)
```